import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the stored master archive into a DataFrame.
df = pd.read_csv('twitter_archive_master.csv')

# Cast the columns used below to the dtypes the analysis relies on:
#   * tweet_id is an identifier, not a quantity, so it is kept as text;
#   * source and stage are stored as categoricals;
#   * favorites and retweets are integer counts;
#   * the three rating columns are floats.
# Each column is converted from *itself*. The previous version filled
# rating_denominator from tweet_id, which overwrote the real denominators
# with tweet ids.
df = df.astype({
    'tweet_id': str,
    'source': 'category',
    'favorites': int,
    'retweets': int,
    'stage': 'category',
    'rating_numerator': float,
    'rating_denominator': float,
    'dog_rating': float,
})
# Count rows per dog stage, then remove the 'None' placeholder by label.
# Dropping by label (instead of slicing off the first row) stays correct even
# if 'None' is not the first group, and errors='ignore' makes it a no-op when
# the placeholder is absent.
data = df.groupby('stage')['stage'].count().drop('None', errors='ignore')

# Interactive donut chart (hole=.4) of the share of each dog stage.
# The counts are reshaped into their own two-column frame so the chart is
# driven by named columns that match the plotted data, rather than passing the
# full `df` together with arrays of a different length.
stage_counts = data.reset_index(name='count')
fig = px.pie(
    stage_counts,
    values='count',
    names='stage',
    color_discrete_sequence=px.colors.diverging.Spectral,
    hole=.4,
)
fig.show()
The pie chart above shows the percentage of each dog stage in the whole Twitter archive. Pupper is the most common stage, with 66.6 percent of all dog pictures. On the other hand, Doggofluffer and Doggopuppo are the least common stages. It is important to note that all the 'None' values are omitted before plotting.
# Tally how many rows fall into each (stage, dog_rating) pair, ignoring rows
# whose stage is the 'None' placeholder. The tally column is named 'count'.
has_stage = df['stage'] != 'None'
rating_counts = df.loc[has_stage, ['stage', 'dog_rating']].value_counts()
dog_rating_data = rating_counts.reset_index(name='count')

# Bar chart of those tallies: stage on the x axis, coloured by rating value.
fig = px.bar(
    dog_rating_data,
    x='stage',
    y='count',
    color='dog_rating',
)
fig.show()
# Show the row(s) of the tally table that carry the highest dog_rating.
top_rating = dog_rating_data['dog_rating'].max()
dog_rating_data[dog_rating_data['dog_rating'] == top_rating]
| | stage | dog_rating | count |
|---|---|---|---|
| 21 | pupper | 2.7 | 1 |
We can see that Pupper received the largest number of ratings (>200). Doggo received more than 50 ratings, whereas the other dog stages received fewer than 20. Finally, Pupper received the highest rating, which is 2.7.
# Count occurrences of every (stage, prediction) pair among rows that have a
# real stage (i.e. not the 'None' placeholder).
staged_predictions = df.loc[df['stage'] != 'None', ['stage', 'prediction']]
dog_prediction_data = staged_predictions.value_counts().reset_index(name='count')

# Keep only the pairs that occur at least twice.
seen_twice_or_more = dog_prediction_data['count'] >= 2
dog_prediction_data = dog_prediction_data[seen_twice_or_more]

# Bar chart per stage, coloured by the predicted label.
fig = px.bar(
    dog_prediction_data,
    x='stage',
    y='count',
    color='prediction',
)
fig.show()
The prediction algorithm's predictions fall mostly in the Pupper stage, which has more than 100 of them. Apart from that, Golden Retriever is the most common dog breed in the whole dataframe.
# Scatter plot of favorites against retweets, coloured by dog stage, with an
# OLS trendline requested from Plotly Express.
fig_sc = px.scatter(
    df,
    x='retweets',
    y='favorites',
    color='stage',
    trendline='ols',
)
fig_sc.show()
It seems retweets and favorites are positively correlated. The Doggo stage has the highest retweets and favorites, but most retweets and favorites range between 0 and 40k.
# Line chart of favorites over time.
# `timestamp` is read from the CSV without any date parsing and the rows are
# plotted in file order, so the line is only meaningful if the file happens to
# be chronologically sorted. Parse the column to real datetimes and sort by it
# first; this is done on a copy so `df` itself is left untouched.
favorites_over_time = (
    df.assign(timestamp=pd.to_datetime(df['timestamp']))
    .sort_values('timestamp')
)
fig_line = px.line(favorites_over_time, x='timestamp', y='favorites')
fig_line.show()
This trend plot indicates an upward trend over time from January 2016 to July 2017.